Loading Libraries and Data Set
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.0 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(readxl)
library(dplyr)
# Load the NFL 2011-2023 data set from CSV into main_df.
# The absolute path is specific to the author's machine and must be adjusted
# before this can run anywhere else.
main_df<-read_csv("/Users/abbysommers/Desktop/Summer Research Project/NFL_Data_2011-2023.csv")
## Rows: 416 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): LOCATION, TEAM, CONFERENCE, REGION
## dbl (6): YEAR, WINS, LOSSES, W/L_PERCENT, HIGH_PERCENT, LOW_PERCENT
## num (5): TOTAL_ALLOT, HIGHEST_SALARY, LOWEST_SALARY, HI_LOW_DIFF, AVE_SALARY
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Give the win-loss column a slash-free name so it can be referenced without
# quoting, then store YEAR as character instead of numeric.
main_df <- rename(main_df, W.L_PERCENT = `W/L_PERCENT`)
main_df[["YEAR"]] <- as.character(main_df[["YEAR"]])
Making Separate Data Sets Based on Conf. and Region
# One subset per division: the rows of main_df matching a given
# conference/region pair.
afc_east <- filter(main_df, CONFERENCE == "AFC" & REGION == "East")
afc_north <- filter(main_df, CONFERENCE == "AFC" & REGION == "North")
afc_south <- filter(main_df, CONFERENCE == "AFC" & REGION == "South")
afc_west <- filter(main_df, CONFERENCE == "AFC" & REGION == "West")
nfc_east <- filter(main_df, CONFERENCE == "NFC" & REGION == "East")
nfc_north <- filter(main_df, CONFERENCE == "NFC" & REGION == "North")
nfc_south <- filter(main_df, CONFERENCE == "NFC" & REGION == "South")
nfc_west <- filter(main_df, CONFERENCE == "NFC" & REGION == "West")
head(nfc_west)
## # A tibble: 6 × 15
## LOCATION TEAM YEAR CONFERENCE REGION WINS LOSSES W.L_PERCENT TOTAL_ALLOT
## <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Arizona Cardina… 2011 NFC West 8 8 0.5 118470364
## 2 Arizona Cardina… 2012 NFC West 5 11 0.312 117797796
## 3 Arizona Cardina… 2013 NFC West 13 3 0.812 120046145
## 4 Arizona Cardina… 2014 NFC West 11 5 0.688 133228945
## 5 Arizona Cardina… 2015 NFC West 13 3 0.812 144707976
## 6 Arizona Cardina… 2016 NFC West 7 9 0.438 155855865
## # ℹ 6 more variables: HIGHEST_SALARY <dbl>, HIGH_PERCENT <dbl>,
## # LOWEST_SALARY <dbl>, LOW_PERCENT <dbl>, HI_LOW_DIFF <dbl>, AVE_SALARY <dbl>
head(afc_west)
## # A tibble: 6 × 15
## LOCATION TEAM YEAR CONFERENCE REGION WINS LOSSES W.L_PERCENT TOTAL_ALLOT
## <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Los Angeles Char… 2011 AFC West 8 8 0.5 121746575
## 2 Los Angeles Char… 2012 AFC West 7 9 0.438 125967448
## 3 Los Angeles Char… 2013 AFC West 9 7 0.562 120452285
## 4 Los Angeles Char… 2014 AFC West 9 7 0.562 134236725
## 5 Los Angeles Char… 2015 AFC West 4 12 0.25 140814655
## 6 Los Angeles Char… 2016 AFC West 5 11 0.312 155470367
## # ℹ 6 more variables: HIGHEST_SALARY <dbl>, HIGH_PERCENT <dbl>,
## # LOWEST_SALARY <dbl>, LOW_PERCENT <dbl>, HI_LOW_DIFF <dbl>, AVE_SALARY <dbl>
head(nfc_east)
## # A tibble: 6 × 15
## LOCATION TEAM YEAR CONFERENCE REGION WINS LOSSES W.L_PERCENT TOTAL_ALLOT
## <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 New York Giants 2011 NFC East 9 7 0.562 119706771
## 2 New York Giants 2012 NFC East 9 7 0.562 125514604
## 3 New York Giants 2013 NFC East 7 9 0.438 129256255
## 4 New York Giants 2014 NFC East 6 10 0.375 131935198
## 5 New York Giants 2015 NFC East 6 10 0.375 128321422
## 6 New York Giants 2016 NFC East 11 5 0.688 154901229
## # ℹ 6 more variables: HIGHEST_SALARY <dbl>, HIGH_PERCENT <dbl>,
## # LOWEST_SALARY <dbl>, LOW_PERCENT <dbl>, HI_LOW_DIFF <dbl>, AVE_SALARY <dbl>
head(afc_east)
## # A tibble: 6 × 15
## LOCATION TEAM YEAR CONFERENCE REGION WINS LOSSES W.L_PERCENT TOTAL_ALLOT
## <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Miami Dolphins 2011 AFC East 6 10 0.375 132817373
## 2 Miami Dolphins 2012 AFC East 7 9 0.438 122292261
## 3 Miami Dolphins 2013 AFC East 8 8 0.5 112354370
## 4 Miami Dolphins 2014 AFC East 8 8 0.5 142455998
## 5 Miami Dolphins 2015 AFC East 6 10 0.375 144878936
## 6 Miami Dolphins 2016 AFC East 10 6 0.625 151387347
## # ℹ 6 more variables: HIGHEST_SALARY <dbl>, HIGH_PERCENT <dbl>,
## # LOWEST_SALARY <dbl>, LOW_PERCENT <dbl>, HI_LOW_DIFF <dbl>, AVE_SALARY <dbl>
head(nfc_south)
## # A tibble: 6 × 15
## LOCATION TEAM YEAR CONFERENCE REGION WINS LOSSES W.L_PERCENT TOTAL_ALLOT
## <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Atlanta Falcons 2011 NFC South 10 6 0.625 126014378
## 2 Atlanta Falcons 2012 NFC South 13 3 0.812 121447956
## 3 Atlanta Falcons 2013 NFC South 4 12 0.25 110531645
## 4 Atlanta Falcons 2014 NFC South 6 10 0.375 133009495
## 5 Atlanta Falcons 2015 NFC South 8 8 0.5 136236065
## 6 Atlanta Falcons 2016 NFC South 11 5 0.688 153820903
## # ℹ 6 more variables: HIGHEST_SALARY <dbl>, HIGH_PERCENT <dbl>,
## # LOWEST_SALARY <dbl>, LOW_PERCENT <dbl>, HI_LOW_DIFF <dbl>, AVE_SALARY <dbl>
head(afc_south)
## # A tibble: 6 × 15
## LOCATION TEAM YEAR CONFERENCE REGION WINS LOSSES W.L_PERCENT TOTAL_ALLOT
## <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Jacksonvil… Jagu… 2011 AFC South 5 11 0.312 92775655
## 2 Jacksonvil… Jagu… 2012 AFC South 2 14 0.125 129514023
## 3 Jacksonvil… Jagu… 2013 AFC South 4 12 0.25 120190441
## 4 Jacksonvil… Jagu… 2014 AFC South 3 13 0.188 129184374
## 5 Jacksonvil… Jagu… 2015 AFC South 5 11 0.312 136130404
## 6 Jacksonvil… Jagu… 2016 AFC South 3 13 0.188 150459217
## # ℹ 6 more variables: HIGHEST_SALARY <dbl>, HIGH_PERCENT <dbl>,
## # LOWEST_SALARY <dbl>, LOW_PERCENT <dbl>, HI_LOW_DIFF <dbl>, AVE_SALARY <dbl>
head(nfc_north)
## # A tibble: 6 × 15
## LOCATION TEAM YEAR CONFERENCE REGION WINS LOSSES W.L_PERCENT TOTAL_ALLOT
## <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Chicago Bears 2011 NFC North 8 8 0.5 107756643
## 2 Chicago Bears 2012 NFC North 10 6 0.625 128921164
## 3 Chicago Bears 2013 NFC North 8 8 0.5 127169835
## 4 Chicago Bears 2014 NFC North 5 11 0.312 131432644
## 5 Chicago Bears 2015 NFC North 6 10 0.375 141512728
## 6 Chicago Bears 2016 NFC North 3 13 0.188 148323553
## # ℹ 6 more variables: HIGHEST_SALARY <dbl>, HIGH_PERCENT <dbl>,
## # LOWEST_SALARY <dbl>, LOW_PERCENT <dbl>, HI_LOW_DIFF <dbl>, AVE_SALARY <dbl>
head(afc_north)
## # A tibble: 6 × 15
## LOCATION TEAM YEAR CONFERENCE REGION WINS LOSSES W.L_PERCENT TOTAL_ALLOT
## <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Cleaveland Browns 2011 AFC North 4 12 0.25 103839520
## 2 Cleaveland Browns 2012 AFC North 5 11 0.312 127350645
## 3 Cleaveland Browns 2013 AFC North 4 12 0.25 110066758
## 4 Cleaveland Browns 2014 AFC North 7 9 0.438 135401214
## 5 Cleaveland Browns 2015 AFC North 3 13 0.188 139647560
## 6 Cleaveland Browns 2016 AFC North 1 15 0.0625 130222141
## # ℹ 6 more variables: HIGHEST_SALARY <dbl>, HIGH_PERCENT <dbl>,
## # LOWEST_SALARY <dbl>, LOW_PERCENT <dbl>, HI_LOW_DIFF <dbl>, AVE_SALARY <dbl>
# Tag every division subset with a "<conference> <region>" label in a new
# `loc` column, then stack the eight subsets back together as main_df.
afc_east <- mutate(afc_east, loc = "AFC East")
nfc_east <- mutate(nfc_east, loc = "NFC East")
afc_west <- mutate(afc_west, loc = "AFC West")
nfc_west <- mutate(nfc_west, loc = "NFC West")
nfc_north <- mutate(nfc_north, loc = "NFC North")
nfc_south <- mutate(nfc_south, loc = "NFC South")
afc_south <- mutate(afc_south, loc = "AFC South")
afc_north <- mutate(afc_north, loc = "AFC North")
main_df <- bind_rows(
  afc_east, nfc_east, afc_west, nfc_west,
  afc_north, nfc_north, afc_south, nfc_south
)
# View(main_df)
Preliminary Thoughts
Studying the relationship between win-loss percentages and average salaries across 13 years of NFL data is an intriguing and complex analysis. Here are some preliminary thoughts and considerations before we dive into this study:
Thoughts-Predictions
Preliminary thoughts can help guide data analysis in terms of formulating certain hypotheses that could answer the main questions that agencies would want to know.
Considerations
Steps to Analyze
Descriptive Statistics
Summary Statistics: Calculating means, medians, modes, standard deviations, and ranges for discrete and continuous variables. Frequency Distributions: Counting the frequency and percentages of categorical variables.
Data Cleaning
The data set was manually created by members of the research group to avoid missing values and outliers. It was made sure that the complete data set of all NFL data was formatted correctly and could easily be analyzed in a programming language. The main data frame is called ‘main_df’. The main data frame may be broken down into subsets if necessary for examining a specific part of the data.
Exploratory Data Analysis
Data Visualization - Categorical Data: Bar Graphs - Quantitative Data: Histograms and Scatter Plots
WINS
# Histogram of season win totals, with an x tick at every win count from 0 to
# 15 and y ticks every 5 up to 60.
ggplot(data = main_df, mapping = aes(x = WINS)) +
  geom_histogram(fill = "cornflowerblue", binwidth = 0.5) +
  scale_x_continuous(breaks = seq(0, 15, by = 1)) +
  scale_y_continuous(breaks = seq(0, 60, by = 5)) +
  labs(title = "Histogram of NFL Wins", y = "Frequency") +
  theme(axis.text.x = element_text(hjust = 1, angle = 45))
The distribution of NFL wins is roughly symmetrical, with seven wins being the most common season total.
LOSSES
# Histogram of season loss totals, with an x tick at every loss count from 0
# to 16 and y ticks every 5 up to 60.
ggplot(data = main_df, mapping = aes(x = LOSSES)) +
  geom_histogram(fill = "red3", binwidth = 0.5) +
  scale_x_continuous(breaks = seq(0, 16, by = 1)) +
  scale_y_continuous(breaks = seq(0, 60, by = 5)) +
  labs(title = "Histogram of NFL Losses", y = "Frequency") +
  theme(axis.text.x = element_text(hjust = 1, angle = 45))
The distribution of NFL losses is roughly symmetrical, with nine losses being the most common season total.
WIN.LOSS_PERCENT
# Histogram of win-loss percentage in bins of width 0.05, with y ticks every
# 5 up to 60.
ggplot(data = main_df, mapping = aes(x = W.L_PERCENT)) +
  geom_histogram(fill = "mediumpurple4", binwidth = 0.05) +
  scale_y_continuous(breaks = seq(0, 60, by = 5)) +
  labs(
    title = "Histogram of Win-Loss Percentages",
    x = "Win_Loss Percentage",
    y = "Frequency"
  ) +
  theme(axis.text.x = element_text(hjust = 1, angle = 45))
The distribution of NFL win-loss percentages is roughly symmetrical, with most teams having an approximate .500 percentage.
TOTAL ALLOCATION
# Histogram of total salary allocation (default binning); x tick labels are
# printed in full rather than in scientific notation.
ggplot(data = main_df, mapping = aes(x = TOTAL_ALLOT)) +
  geom_histogram(fill = "khaki2") +
  scale_x_continuous(labels = function(x) format(x, scientific = FALSE)) +
  labs(
    title = "Histogram of Total Salary Allocation",
    x = "Total Allocation (in millions)",
    y = "Frequency"
  )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Total salary allocation (a continuous variable) is difficult to graph because its value differs across every team and year. Total allocation depends on numerous factors, giving us a histogram that is neither normal nor symmetrical.
Violin Plot - Total Allocation
# Violin plots of total salary allocation by division, one figure per
# conference. AFC and NFC (the per-conference subsets of main_df) are created
# here and reused by later plots and summaries in this file.
# The label text spells "Affiliation" correctly (previously "Affliation").
AFC <- main_df %>%
  filter(CONFERENCE == "AFC")
ggplot(AFC, aes(x = loc, y = TOTAL_ALLOT, fill = loc)) +
  geom_violin() +
  labs(
    x = "Affiliation",
    y = "Total Allocation",
    title = "Violin Plot of Total Allocation by Affiliation",
    fill = "Affiliation"
  ) +
  scale_y_continuous(labels = function(y) format(y, scientific = FALSE))
NFC <- main_df %>%
  filter(CONFERENCE == "NFC")
ggplot(NFC, aes(x = loc, y = TOTAL_ALLOT, fill = loc)) +
  geom_violin() +
  labs(
    x = "Affiliation",
    y = "Total Allocation",
    title = "Violin Plot of Total Allocation by Affiliation",
    fill = "Affiliation"
  ) +
  scale_y_continuous(labels = function(y) format(y, scientific = FALSE))
HIGHEST SALARY
# Histogram of each team-season's highest salary (default binning); x tick
# labels are printed in full rather than in scientific notation.
ggplot(data = main_df, mapping = aes(x = HIGHEST_SALARY)) +
  geom_histogram(fill = "sandybrown") +
  scale_x_continuous(labels = function(x) format(x, scientific = FALSE)) +
  labs(
    title = "Histogram of Highest Salary",
    x = "Highest Salary (in millions)",
    y = "Frequency"
  ) +
  theme(axis.text.x = element_text(hjust = 1, angle = 45))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
The highest salary (a continuous variable) has a slightly right-skewed distribution. Looking at the histogram, most highest salaries are concentrated roughly midway between \(10,000,000\) and \(20,000,000\) dollars.
Violin Plot - Highest Salary
# Violin plots of highest salary by division: AFC first, then NFC.
# The label text spells "Affiliation" correctly (previously "Affliation").
ggplot(AFC, aes(x = loc, y = HIGHEST_SALARY, fill = loc)) +
  geom_violin() +
  labs(
    x = "Affiliation",
    y = "Highest Salary (in millions)",
    title = "Violin Plot of Highest Salary by Affiliation",
    fill = "Affiliation"
  ) +
  scale_y_continuous(labels = function(y) format(y, scientific = FALSE))
ggplot(NFC, aes(x = loc, y = HIGHEST_SALARY, fill = loc)) +
  geom_violin() +
  labs(
    x = "Affiliation",
    y = "Highest Salary (in millions)",
    title = "Violin Plot of Highest Salary by Affiliation",
    fill = "Affiliation"
  ) +
  scale_y_continuous(labels = function(y) format(y, scientific = FALSE))
LOWEST SALARY
# Histogram of each team-season's lowest salary (default binning); x tick
# labels are printed in full rather than in scientific notation.
ggplot(data = main_df, mapping = aes(x = LOWEST_SALARY)) +
  geom_histogram(fill = "aquamarine3") +
  scale_x_continuous(labels = function(x) format(x, scientific = FALSE)) +
  labs(
    title = "Histogram of Lowest Salaries",
    x = "Lowest Salary (in millions)",
    y = "Frequency"
  ) +
  theme(axis.text.x = element_text(hjust = 1, angle = 45))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Violin Plot - Lowest Salary
# Violin plots of lowest salary by division: AFC first, then NFC.
# The label text spells "Affiliation" correctly (previously "Affliation").
ggplot(AFC, aes(x = loc, y = LOWEST_SALARY, fill = loc)) +
  geom_violin() +
  labs(
    x = "Affiliation",
    y = "Lowest Salary (in millions)",
    title = "Violin Plot of Lowest Salary by Affiliation",
    fill = "Affiliation"
  ) +
  scale_y_continuous(labels = function(y) format(y, scientific = FALSE))
ggplot(NFC, aes(x = loc, y = LOWEST_SALARY, fill = loc)) +
  geom_violin() +
  labs(
    x = "Affiliation",
    y = "Lowest Salary (in millions)",
    title = "Violin Plot of Lowest Salary by Affiliation",
    fill = "Affiliation"
  ) +
  scale_y_continuous(labels = function(y) format(y, scientific = FALSE))
HIGH LOW DIFFERENCE (The difference between the highest and lowest salary)
# Histogram of the highest-minus-lowest salary gap (default binning); x tick
# labels are printed in full rather than in scientific notation.
ggplot(data = main_df, mapping = aes(x = HI_LOW_DIFF)) +
  geom_histogram(fill = "honeydew4") +
  scale_x_continuous(labels = function(x) format(x, scientific = FALSE)) +
  labs(
    title = "Histogram",
    x = "Difference Between Highest and Lowest Salary",
    y = "Frequency"
  ) +
  theme(axis.text.x = element_text(hjust = 1, angle = 45))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
AVERAGE SALARY
# Histogram of average salary (default binning); x tick labels are printed in
# full rather than in scientific notation.
ggplot(data = main_df, mapping = aes(x = AVE_SALARY)) +
  geom_histogram(fill = "cyan2") +
  scale_x_continuous(labels = function(x) format(x, scientific = FALSE)) +
  labs(
    title = "Histogram of Average Salary",
    x = "Average Salary (in millions)",
    y = "Frequency"
  ) +
  theme(axis.text.x = element_text(hjust = 1, angle = 45))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
For most teams, the average salary ranges from \(5,000,000\) to \(10,000,000\) dollars.
Violin Plot - Average Salary By Region
# Violin plots of average salary by division: AFC first, then NFC.
# The label text spells "Affiliation" correctly (previously "Affliation").
ggplot(AFC, aes(x = loc, y = AVE_SALARY, fill = loc)) +
  geom_violin() +
  labs(
    x = "Affiliation",
    y = "Average Salary (in millions)",
    title = "Violin Plot of Average Salary by Affiliation",
    fill = "Affiliation"
  ) +
  scale_y_continuous(labels = function(y) format(y, scientific = FALSE))
ggplot(NFC, aes(x = loc, y = AVE_SALARY, fill = loc)) +
  geom_violin() +
  labs(
    x = "Affiliation",
    y = "Average Salary (in millions)",
    title = "Violin Plot of Average Salary by Affiliation",
    fill = "Affiliation"
  ) +
  scale_y_continuous(labels = function(y) format(y, scientific = FALSE))
Bar Graphs - NFC
# Bar chart of each team's mean AVE_SALARY within one division.
#
# The division data frames hold one row per team per season, so the rows have
# to be averaged before drawing. geom_bar(stat = "identity") stacked every
# season on top of the previous one, which made each bar the SUM of the
# seasonal averages even though the axis and title say "Average Salary".
#
# division_df:   one of the per-division subsets (needs TEAM and AVE_SALARY).
# division_name: label appended to the plot title, e.g. "NFC East".
# Returns the ggplot object; calling it at top level prints the figure.
plot_division_avg_salary <- function(division_df, division_name) {
  ggplot(division_df, aes(x = TEAM, y = AVE_SALARY, fill = TEAM)) +
    stat_summary(fun = mean, geom = "bar") +
    labs(
      x = "Team",
      y = "Average Salary (in millions)",
      title = paste("Average Salary by Teams In The", division_name)
    ) +
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
    scale_y_continuous(labels = function(y) format(y, scientific = FALSE))
}

plot_division_avg_salary(nfc_east, "NFC East")
plot_division_avg_salary(nfc_west, "NFC West")
plot_division_avg_salary(nfc_south, "NFC South")
plot_division_avg_salary(nfc_north, "NFC North")
Bar Graphs - AFC
# Bar chart of each team's mean AVE_SALARY within one division.
#
# The division data frames hold one row per team per season, so the rows have
# to be averaged before drawing. geom_bar(stat = "identity") stacked every
# season on top of the previous one, which made each bar the SUM of the
# seasonal averages even though the axis and title say "Average Salary".
#
# division_df:   one of the per-division subsets (needs TEAM and AVE_SALARY).
# division_name: label appended to the plot title, e.g. "AFC East".
# Returns the ggplot object; calling it at top level prints the figure.
plot_division_avg_salary <- function(division_df, division_name) {
  ggplot(division_df, aes(x = TEAM, y = AVE_SALARY, fill = TEAM)) +
    stat_summary(fun = mean, geom = "bar") +
    labs(
      x = "Team",
      y = "Average Salary (in millions)",
      title = paste("Average Salary by Teams In The", division_name)
    ) +
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
    scale_y_continuous(labels = function(y) format(y, scientific = FALSE))
}

plot_division_avg_salary(afc_east, "AFC East")
plot_division_avg_salary(afc_west, "AFC West")
plot_division_avg_salary(afc_north, "AFC North")
plot_division_avg_salary(afc_south, "AFC South")
Initial Look at the Relationship Between Win-Loss Percentage and Average Salary
# Cast YEAR to character (again) before the relationship plots.
main_df[["YEAR"]] <- as.character(main_df[["YEAR"]])
# AFC: win-loss percentage against average salary, with one panel and one
# fitted straight line per division.
ggplot(AFC, aes(AVE_SALARY, W.L_PERCENT, colour = loc)) +
  geom_point(size = 2.5) +
  geom_smooth(method = "lm", se = FALSE, linetype = "solid") +
  facet_wrap(vars(loc)) +
  scale_x_continuous(labels = function(x) format(x, scientific = FALSE)) +
  labs(
    title = "Win-Loss Percentage vs Average Salary in the AFC Conference",
    x = "Average Salary (in millions)",
    y = "Win-Loss Percentage"
  ) +
  theme(axis.text.x = element_text(hjust = 1, angle = 45))
## `geom_smooth()` using formula = 'y ~ x'
# NFC: win-loss percentage against average salary, with one panel and one
# fitted straight line per division.
ggplot(NFC, aes(AVE_SALARY, W.L_PERCENT, colour = loc)) +
  geom_point(size = 2.5) +
  geom_smooth(method = "lm", se = FALSE, linetype = "solid") +
  facet_wrap(vars(loc)) +
  scale_x_continuous(labels = function(x) format(x, scientific = FALSE)) +
  labs(
    title = "Win-Loss Percentage vs Average Salary in the NFC Conference",
    x = "Average Salary (in millions)",
    y = "Win-Loss Percentage"
  ) +
  theme(axis.text.x = element_text(hjust = 1, angle = 45))
## `geom_smooth()` using formula = 'y ~ x'
# League-wide scatter of win-loss percentage against average salary with a
# fitted straight line.
# Breaks and labels are set in ONE scale call: adding a second
# scale_x_continuous() replaces the first, which silently threw away the
# non-scientific label formatting.
ggplot(data = main_df, aes(x = AVE_SALARY, y = W.L_PERCENT)) +
  geom_point(size = 2.5) +
  geom_smooth(method = lm, se = FALSE, linetype = "solid") +
  labs(
    x = "Average Salary",
    y = "Win-Loss Percentage",
    title = "Win-Loss Percentage vs Average Salary by Region"
  ) +
  scale_x_continuous(
    breaks = seq(0, 20000000, by = 2500000),
    # trim = TRUE stops format() from left-padding labels to a common width
    labels = function(x) format(x, scientific = FALSE, trim = TRUE)
  )
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
## `geom_smooth()` using formula = 'y ~ x'
Initial Look at the Relationship Between Win-Loss Percentage and Average Salary (grouped by year)
# Win-loss percentage against average salary, one panel per season, each with
# its own fitted straight line drawn in black.
# Breaks and labels are set in ONE scale call: adding a second
# scale_x_continuous() replaces the first, which silently threw away the
# non-scientific label formatting.
ggplot(data = main_df, aes(x = AVE_SALARY, y = W.L_PERCENT, color = YEAR)) +
  geom_point(size = 2.5) +
  geom_smooth(method = lm, se = FALSE, linetype = "solid", color = "black") +
  labs(
    x = "Average Salary",
    y = "Win-Loss Percentage",
    title = "Win-Loss Percentage vs Average Salary by Year"
  ) +
  scale_x_continuous(
    breaks = seq(0, 20000000, by = 2500000),
    # trim = TRUE stops format() from left-padding labels to a common width
    labels = function(x) format(x, scientific = FALSE, trim = TRUE)
  ) +
  facet_wrap(~ YEAR) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
## `geom_smooth()` using formula = 'y ~ x'
# Seasons 2011-2016: win-loss percentage against average salary, one panel per
# season, points coloured by division.
# YEAR is stored as character, so it is converted explicitly before the
# numeric comparison instead of relying on implicit string coercion.
nfl_2011_2016 <- main_df %>%
  filter(as.integer(YEAR) < 2017)
# Breaks and labels are set in ONE scale call: adding a second
# scale_x_continuous() replaces the first, which silently threw away the
# non-scientific label formatting.
ggplot(data = nfl_2011_2016, aes(x = AVE_SALARY, y = W.L_PERCENT, color = loc)) +
  geom_point(size = 2.5) +
  geom_smooth(method = lm, se = FALSE, linetype = "solid", color = "black") +
  labs(
    x = "Average Salary (in millions)",
    y = "Win-Loss Percentage",
    title = "Win-Loss Percentage vs Average Salary by Year"
  ) +
  scale_x_continuous(
    breaks = c(3000000, seq(5000000, 20000000, by = 2500000)),
    # trim = TRUE stops format() from left-padding labels to a common width
    labels = function(x) format(x, scientific = FALSE, trim = TRUE)
  ) +
  facet_wrap(~ YEAR) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
## `geom_smooth()` using formula = 'y ~ x'
# Seasons 2017-2020: win-loss percentage against average salary, one panel per
# season.
# YEAR is stored as character, so it is converted explicitly before the
# numeric comparison instead of relying on implicit string coercion.
nfl_2017_2020 <- main_df %>%
  filter(as.integer(YEAR) > 2016, as.integer(YEAR) < 2021)
# Breaks and labels are set in ONE scale call: adding a second
# scale_x_continuous() replaces the first, which silently threw away the
# non-scientific label formatting.
ggplot(data = nfl_2017_2020, aes(x = AVE_SALARY, y = W.L_PERCENT, color = YEAR)) +
  geom_point(size = 2.5) +
  geom_smooth(method = lm, se = FALSE, linetype = "solid", color = "black") +
  labs(
    x = "Average Salary",
    y = "Win-Loss Percentage",
    title = "Win-Loss Percentage vs Average Salary by Year"
  ) +
  scale_x_continuous(
    breaks = seq(0, 20000000, by = 2500000),
    # trim = TRUE stops format() from left-padding labels to a common width
    labels = function(x) format(x, scientific = FALSE, trim = TRUE)
  ) +
  facet_wrap(~ YEAR) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
## `geom_smooth()` using formula = 'y ~ x'
# Seasons after 2020: win-loss percentage against average salary, one panel
# per season.
# YEAR is stored as character, so it is converted explicitly before the
# numeric comparison instead of relying on implicit string coercion.
nfl_2021_2023 <- main_df %>%
  filter(as.integer(YEAR) > 2020)
# Breaks and labels are set in ONE scale call: adding a second
# scale_x_continuous() replaces the first, which silently threw away the
# non-scientific label formatting.
ggplot(data = nfl_2021_2023, aes(x = AVE_SALARY, y = W.L_PERCENT, color = YEAR)) +
  geom_point(size = 2.5) +
  geom_smooth(method = lm, se = FALSE, linetype = "solid", color = "black") +
  labs(
    x = "Average Salary",
    y = "Win-Loss Percentage",
    title = "Win-Loss Percentage vs Average Salary by Year"
  ) +
  scale_x_continuous(
    breaks = seq(0, 20000000, by = 2500000),
    # trim = TRUE stops format() from left-padding labels to a common width
    labels = function(x) format(x, scientific = FALSE, trim = TRUE)
  ) +
  facet_wrap(~ YEAR) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
## `geom_smooth()` using formula = 'y ~ x'
Wins vs Total Allocation AFC
# AFC: wins against total salary allocation, points coloured by division,
# with black fitted straight lines.
ggplot(AFC, aes(TOTAL_ALLOT, WINS, colour = loc)) +
  geom_point(size = 2.5) +
  geom_smooth(method = "lm", se = FALSE, linetype = "solid", colour = "black") +
  scale_x_continuous(labels = function(x) format(x, scientific = FALSE)) +
  labs(
    title = "Wins vs Total Allocation in the AFC",
    x = "Total Allocation",
    y = "Wins"
  )
## `geom_smooth()` using formula = 'y ~ x'
Wins vs Total Allocation NFC
# NFC: wins against total salary allocation, points coloured by division,
# with black fitted straight lines.
ggplot(NFC, aes(TOTAL_ALLOT, WINS, colour = loc)) +
  geom_point(size = 2.5) +
  geom_smooth(method = "lm", se = FALSE, linetype = "solid", colour = "black") +
  scale_x_continuous(labels = function(x) format(x, scientific = FALSE)) +
  labs(
    title = "Wins vs Total Allocation in the NFC",
    x = "Total Allocation",
    y = "Wins"
  )
## `geom_smooth()` using formula = 'y ~ x'
Summary Statistics
Calculating means, medians, modes, standard deviations, and ranges for discrete and continuous variables.
Discrete variables: Wins, Losses, Win-Loss Percentage, Highest Salary Percent, Lowest Salary Percent
cat("Wins:\n")
## Wins:
summary(main_df$WINS)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 6.000 8.000 8.079 10.000 15.000
cat("Losses:\n")
## Losses:
summary(main_df$LOSSES)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 6.000 8.000 7.969 10.000 16.000
cat("High Percent:\n")
## High Percent:
summary(main_df$HIGH_PERCENT)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.04286 0.08000 0.09846 0.10200 0.12130 0.18564
cat("Low Percent:\n")
## Low Percent:
summary(main_df$LOW_PERCENT)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000100 0.0001900 0.0002750 0.0005283 0.0005900 0.0036200
cat("Win-Loss Percentage:\n")
## Win-Loss Percentage:
summary(main_df$W.L_PERCENT)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.3750 0.5000 0.5031 0.6471 0.9375
# Subset of main_df holding the rows whose YEAR equals 2011 (no team filter).
# NOTE(review): the name carolina_2015 does not match the filter, which
# selects the 2011 season for every team, and the object is not used in the
# visible part of this file. Confirm the intended team/season or remove it.
carolina_2015<-main_df%>%
filter(YEAR==2011)
Continuous variables: Total Allocation, Highest Salary, Lowest Salary, High and Low Salary Difference, Average Salary
cat("Total Allocation:\n")
## Total Allocation:
summary(main_df$TOTAL_ALLOT)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 87714299 132019471 161688340 163999166 194275455 232771942
cat("Highest Salary:\n")
## Highest Salary:
summary(main_df$HIGHEST_SALARY)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 5610000 12300000 15850342 16627399 19991750 37133825
cat("Lowest Salary:\n")
## Lowest Salary:
summary(main_df$LOWEST_SALARY)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 667 29118 49080 80339 96225 626664
cat("High and Low Salary Difference:\n")
## High and Low Salary Difference:
summary(main_df$HI_LOW_DIFF)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 5504118 12248240 15752550 16547060 19925657 36977159
Average Salary Variable
cat("Summary Statistics for Average Salary\n")
## Summary Statistics for Average Salary
summary(main_df$AVE_SALARY)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2857941 6167776 7958974 8353869 10013264 18645246
cat("Standard Deviation:\n")
## Standard Deviation:
sd(main_df$AVE_SALARY)
## [1] 2889497
cat("Variance:\n")
## Variance:
var(main_df$AVE_SALARY)
## [1] 8.349196e+12
The variance for average salary is a large number, indicating a large dispersion in the data.
Specifics
mean(AFC$AVE_SALARY)
## [1] 7977605
mean(NFC$AVE_SALARY)
## [1] 8730133
mean(AFC$WINS)
## [1] 8.019231
mean(NFC$WINS)
## [1] 8.139423
mean(afc_east$AVE_SALARY)
## [1] 6992129
mean(afc_east$WINS)
## [1] 8.346154
mean(afc_west$AVE_SALARY)
## [1] 9101711
mean(afc_west$WINS)
## [1] 8.307692
Hypothesis Testing
MAIN QUESTION: We want to understand the relationship between average salaries and win-loss percentages.
Test Assumptions: 1. Random Sampling: Sample is representative of the population.
Normality of Data: Assumes that the data follows a normal distribution, which is important for parametric tests such as t-tests and ANOVA. If the data is not normally distributed, non-parametric tests might be more appropriate.
Independence of Observations: The observations in the sample should be independent of each other. The value of one observation should not influence the value of another.
Homogeneity of Variance: For tests such as ANOVA that compare variances between groups, an assumption is often made that the variances within each group are roughly equal (homogeneity of variances).
Measurement Scale: The data should be measured on an appropriate scale. For example, if using a t-test, the data should be at least on an interval scale. If using a chi-square test, the data should be categorical.
Null Hypothesis Structure: The null hypothesis should be precise and testable.
Sample Size: Larger sample sizes tend to provide more reliable results and increase the power of statistical tests.
# Keep only the two variables under study, then print their five-number
# summaries and means.
research <- select(main_df, W.L_PERCENT, AVE_SALARY)
summary(research)
## W.L_PERCENT AVE_SALARY
## Min. :0.0000 Min. : 2857941
## 1st Qu.:0.3750 1st Qu.: 6167776
## Median :0.5000 Median : 7958974
## Mean :0.5031 Mean : 8353869
## 3rd Qu.:0.6471 3rd Qu.:10013264
## Max. :0.9375 Max. :18645246
# Plain scatter of winning percentage against average salary; x tick labels
# are printed in full rather than in scientific notation.
ggplot(data = research, mapping = aes(AVE_SALARY, W.L_PERCENT)) +
  geom_point() +
  scale_x_continuous(labels = function(x) format(x, scientific = FALSE)) +
  labs(
    y = "Winning Percentage",
    x = "Average Salary",
    title = "Relationship Between Average Salary and Winning Percentage"
  ) +
  theme_minimal()
Correlation Analysis
cor(research$AVE_SALARY, research$W.L_PERCENT, use = "complete.obs")
## [1] 0.1617668
cor(main_df$HIGHEST_SALARY, main_df$W.L_PERCENT, use = "complete.obs")
## [1] 0.1578862
cor(main_df$TOTAL_ALLOT, main_df$W.L_PERCENT, use = "complete.obs")
## [1] 0.06034338
Linear Regression
# Simple linear regression of win-loss percentage on average salary, fitted
# on the two-column `research` data frame.
model <- lm(W.L_PERCENT ~ AVE_SALARY, data = research)
# Print the coefficient table, residual summary and fit statistics.
summary(model)
##
## Call:
## lm(formula = W.L_PERCENT ~ AVE_SALARY, data = research)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.47923 -0.13403 0.00519 0.14523 0.47567
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.132e-01 2.851e-02 14.492 < 2e-16 ***
## AVE_SALARY 1.076e-08 3.226e-09 3.335 0.000929 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1899 on 414 degrees of freedom
## Multiple R-squared: 0.02617, Adjusted R-squared: 0.02382
## F-statistic: 11.12 on 1 and 414 DF, p-value: 0.0009286
# Scatter of winning percentage against average salary with the fitted
# least-squares line overlaid in blue (no confidence band).
ggplot(data = research, mapping = aes(AVE_SALARY, W.L_PERCENT)) +
  geom_point() +
  geom_smooth(method = "lm", colour = "blue", se = FALSE) +
  scale_x_continuous(labels = function(x) format(x, scientific = FALSE)) +
  labs(
    y = "Winning Percentage",
    x = "Average Salary",
    title = "Relationship Between Average Salary and Winning Percentage with Regression Line"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
Plotting residuals
# Residuals of the linear model plotted in row order.
plot(model$residuals)
# Distribution of the same residuals.
hist(model$residuals)
Checking linearity
# Residuals against fitted values for the linear model, with a red reference
# line at zero.
with(
  model,
  plot(
    fitted.values, residuals,
    main = "Residuals vs. Fitted Values",
    xlab = "Fitted Values",
    ylab = "Residuals"
  )
)
abline(h = 0, col = "red")
Linearity assumption is reasonable
Added Variable plots
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:purrr':
##
## some
# Added-variable plot for the linear model (car package).
avPlots(model)
# Base-graphics scatter of winning percentage against average salary, with
# the fitted regression line drawn in blue.
with(
  main_df,
  plot(
    AVE_SALARY, W.L_PERCENT,
    main = "Scatterplot of Average Salary vs. Winning Percentage",
    xlab = "Average Salary",
    ylab = "Winning Percentage"
  )
)
abline(model, col = "blue")
Doing Poly Model instead
# Quadratic alternative to the straight-line fit: regress win-loss percentage
# on a degree-2 polynomial of average salary, built with poly().
poly_model <- lm(W.L_PERCENT ~ poly(AVE_SALARY, 2), data = main_df)
# Print the coefficient table, residual summary and fit statistics.
summary(poly_model)
##
## Call:
## lm(formula = W.L_PERCENT ~ poly(AVE_SALARY, 2), data = main_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.47850 -0.13659 0.00275 0.14416 0.48200
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.503132 0.009319 53.989 < 2e-16 ***
## poly(AVE_SALARY, 2)1 0.633418 0.190075 3.332 0.000938 ***
## poly(AVE_SALARY, 2)2 -0.099206 0.190075 -0.522 0.601999
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1901 on 413 degrees of freedom
## Multiple R-squared: 0.02681, Adjusted R-squared: 0.0221
## F-statistic: 5.689 on 2 and 413 DF, p-value: 0.003654
# Residuals against fitted values for the polynomial model, with a dashed red
# reference line at zero.
ggplot(
  data.frame(
    fitted = poly_model[["fitted.values"]],
    residuals = poly_model[["residuals"]]
  ),
  aes(fitted, residuals)
) +
  geom_point() +
  geom_hline(yintercept = 0, colour = "red", linetype = "dashed") +
  labs(
    y = "Residuals",
    x = "Fitted Values",
    title = "Residuals vs. Fitted Values"
  ) +
  theme_minimal()
Time Series Analysis
# Season-by-season trajectories, drawn as one line per team.
# YEAR is stored as character, so without an explicit group aesthetic ggplot
# groups the rows by year and geom_line() draws one vertical segment per
# season instead of a trend. Grouping by TEAM connects each team's seasons.
# NOTE(review): assumes TEAM uniquely identifies a franchise across all
# seasons; confirm against the data.
ggplot(main_df, aes(x = YEAR, y = W.L_PERCENT, group = TEAM)) +
  geom_line(alpha = 0.5) + # semi-transparent so overlapping team lines stay readable
  labs(
    title = "Winning Percentage Over Time",
    x = "Year",
    y = "Winning Percentage"
  ) +
  theme_minimal()
# Plot Average Salary over Time
ggplot(main_df, aes(x = YEAR, y = AVE_SALARY, group = TEAM)) +
  geom_line(alpha = 0.5) +
  labs(
    title = "Average Salary Over Time",
    x = "Year",
    y = "Average Salary (in millions)"
  ) +
  theme_minimal() +
  scale_y_continuous(labels = function(x) format(x, scientific = FALSE))
Interpretation and Reporting
\[Y_i = \beta_0 + \beta_1X_i+ \epsilon_i\]
\[Y_i: \text{Win Percentage}\]
\[X_i: \text{Average Salary}\]